In [1]:
# Libraries for parsing data
import os
import pickle
import re
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
In [2]:
# Root folder of the project; the save cell at the end of the notebook writes
# '<path_dropbox>/Conference Call Transcript/transcript.pkl' beneath it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_dropbox = "D:/Dropbox/Research/China Foreign Share Discount"

Parse calls and save to a pickle

In [3]:
def callExtract(path, filename):
    """Extract call metadata and the participant list from one transcript XML file.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the XML file inside ``path``.

    Returns
    -------
    list
        ``[filename, eventId, eventTypeId, eventTypeName, eventTitle, city,
        companyName, Ticker, Date, companyId, CUSIP, SEDOL, ISIN, participant]``.
        Attribute and child-element values are strings, or ``None`` when the
        attribute/element is absent (or the element has no text).
        ``participant`` is a list with one string per bulleted entry found
        under the "Conference Call Participants" heading; it is empty when
        the body text or that section is missing.

    Raises
    ------
    OSError
        If the file cannot be opened.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    root = ET.parse(os.path.join(path, filename)).getroot()

    def child_text(tag):
        # A missing element yields None instead of raising AttributeError,
        # so one incomplete file does not abort a multi-year batch run.
        node = root.find(tag)
        return None if node is None else node.text

    # from attributes (None when the attribute is absent)
    eventId = root.get('Id')
    eventTypeId = root.get('eventTypeId')
    eventTypeName = root.get('eventTypeName')

    # from children's text
    eventTitle = child_text('eventTitle')
    city = child_text('city')
    companyName = child_text('companyName')
    Ticker = child_text('companyTicker')
    Date = child_text('startDate')
    companyId = child_text('companyId')
    CUSIP = child_text('CUSIP')
    SEDOL = child_text('SEDOL')
    ISIN = child_text('ISIN')

    # participant list
    # The text is read positionally from the second child of the root's first
    # child — presumably the transcript body; verify against the file schema.
    try:
        text = root[0][1].text
    except IndexError:
        text = None

    participant = []
    if text:
        # Capture everything between the underlined section heading and the
        # next '=' rule that follows a blank line.
        participantPattern = re.search(
            '(Conference Call Participants\n=+)(\n.*?)(\n\n=+)', text, flags=re.DOTALL
        )
        if participantPattern:
            participantText = participantPattern.group(2)
            # Entries are bulleted with '   * '; the chunk before the first
            # bullet is leading whitespace and is dropped.
            participant = participantText.split('\n   * ')[1:]

    return [filename, eventId, eventTypeId, eventTypeName, eventTitle, city,
            companyName, Ticker, Date, companyId, CUSIP, SEDOL, ISIN, participant]
In [4]:
# Column order must match the list returned by callExtract.
transcript_columns = ['filename', 'eventId', 'eventTypeId', 'eventTypeName', 'eventTitle', 'city', 'companyName', 'Ticker', 'Date', 'companyId', 'CUSIP', 'SEDOL', 'ISIN', 'participants']

# Accumulate plain Python rows and build the DataFrame once at the end:
# enlarging a frame with df.loc[len(df)] re-allocates on every append, which
# becomes very slow over many files.
transcript_rows = []
for year in range(2001, 2024):
    path = "E:/Transcripts/" + str(year)
    for filename in os.listdir(path):
        transcript_rows.append(callExtract(path, filename))
    print(str(year) + ' is done!!!')

df = pd.DataFrame(transcript_rows, columns=transcript_columns)
2001 is done!!!
2002 is done!!!
2003 is done!!!
2004 is done!!!
2005 is done!!!
2006 is done!!!
2007 is done!!!
2008 is done!!!
2009 is done!!!
2010 is done!!!
2011 is done!!!
2012 is done!!!
2013 is done!!!
2014 is done!!!
2015 is done!!!
2016 is done!!!
2017 is done!!!
2018 is done!!!
2019 is done!!!
2020 is done!!!
2021 is done!!!
2022 is done!!!
2023 is done!!!
In [5]:
# Persist the parsed transcripts. `pickle` is imported in the imports cell at
# the top of the notebook.
transcript_pickle_path = path_dropbox + '/Conference Call Transcript/transcript.pkl'
# Create the output folder if needed so the long parsing run is not lost to a
# FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(transcript_pickle_path), exist_ok=True)
with open(transcript_pickle_path, 'wb') as file:
    pickle.dump(df, file)